---
title: "2019 5-Yr ACS"
author: "melissa_barales"
output: pdf_document
---

```{r}
# knitr restores the working directory at the end of every chunk, so a bare
# setwd() here does not apply to later chunks when the document is knit.
# Setting root.dir makes every subsequent chunk evaluate in this folder.
knitr::opts_knit$set(root.dir = "~/Desktop/Senior Thesis/Census")
# When chunks are run by hand (not during a knit), change directory directly.
if (!isTRUE(getOption("knitr.in.progress"))) {
  setwd("~/Desktop/Senior Thesis/Census")
}
```

```{r}
# Load the 2015-2019 ACS 5-year estimates from the Stata extract.
# Packages are attached in the same order as before so masking is unchanged.
library(haven)
library(dplyr)
library(tidyverse)
dat <- haven::read_dta(file = "usa_00023.dta")
```

```{r}
# List the columns available in the extract.
colnames(dat)
```

```{r}
# Turn the labelled language vector into a factor of its value labels, so the
# chunks below can compare against label text.
dat[["language"]] <- haven::as_factor(dat[["language"]])
```

```{r}
# Check the class of the language column after the conversion.
class(dat[["language"]])
```
```{r}
# Count observations per language level, including a row for missing values.
table(dat[["language"]], useNA = "always")
```

```{r}
# Add an all-missing ethnicity column; the chunks below fill it in.
dat <- dplyr::mutate(dat, ethnicity = NA)
```

**Creating new ethnic categories** 

*Chinese*

```{r}
# Chinese: language label "chinese" or bpl_mom code 500.
dat$ethnicity[
  which(dat$language == "chinese" | dat$bpl_mom == 500)
] <- "Chinese"
```

*Filipino*
```{r}
# Filipino: language label "filipino, tagalog" or bpl_mom code 515.
dat$ethnicity[
  which(dat$language == "filipino, tagalog" | dat$bpl_mom == 515)
] <- "Filipino"
```

*Korean*
```{r}
# Korean: language label "korean" or bpl_mom code 502.
dat$ethnicity[
  which(dat$language == "korean" | dat$bpl_mom == 502)
] <- "Korean"
```

*Japanese*
```{r}
# Japanese: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "japanese")] <- "Japanese"
```

*Portuguese*
```{r}
# Portuguese: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "portuguese")] <- "Portuguese"
```

*Vietnamese*
```{r}
# Vietnamese: language label "vietnamese" or bpl_mom code 518.
dat$ethnicity[
  which(dat$language == "vietnamese" | dat$bpl_mom == 518)
] <- "Vietnamese"
```

*Thai*
```{r}
# Thai: everyone under the combined "thai, siamese, lao" language label.
dat[["ethnicity"]][which(dat[["language"]] == "thai, siamese, lao")] <- "Thai"
```

*Yiddish*
```{r}
# Yiddish: identified by the "yiddish, jewish" language label.
dat[["ethnicity"]][which(dat[["language"]] == "yiddish, jewish")] <- "Yiddish"
```

```{r}
# Group the Middle Eastern nations whose official language is Arabic:
# collapse their eight bpl_mom codes into one synthetic code, 2000, which the
# Arabic chunk below tests for.
dat$bpl_mom[which(
  dat$bpl_mom == 532 |
    dat$bpl_mom == 535 |
    dat$bpl_mom == 536 |
    dat$bpl_mom == 537 |
    dat$bpl_mom == 540 |
    dat$bpl_mom == 541 |
    dat$bpl_mom == 543 |
    dat$bpl_mom == 544
)] <- 2000
```

*Arabic*
```{r}
# Arabic: either Arabic language label, or the grouped bpl_mom code 2000.
dat$ethnicity[
  dat$language %in% c("arabic", "near east arabic dialect") |
    dat$bpl_mom == 2000
] <- "Arabic"
```

*German*
```{r}
# German: language label "german" or bpl_mom code 453.
dat$ethnicity[
  which(dat$language == "german" | dat$bpl_mom == 453)
] <- "German"
```

*Dutch*
```{r}
# Dutch: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "dutch")] <- "Dutch"
```

*Italian*
```{r}
# Italian: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "italian")] <- "Italian"
```

*French*
```{r}
# French: detailed language code 1100 (languaged) or bpl_mom code 421.
dat$ethnicity[
  which(dat$languaged == 1100 | dat$bpl_mom == 421)
] <- "French"
```

*Haitian*
```{r}
# Haitian: detailed language code 1140 (languaged).
dat[["ethnicity"]][which(dat[["languaged"]] == 1140)] <- "Haitian"
```

*Rumanian*
```{r}
# Rumanian: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "rumanian")] <- "Rumanian"
```

*Greek*
```{r}
# Greek: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "greek")] <- "Greek"
```

*Albanian*
```{r}
# Albanian: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "albanian")] <- "Albanian"
```

*West Slavic*
```{r}
# West Slav: Polish, Slovak, or Czech language labels.
dat$ethnicity[
  dat$language %in% c("polish", "slovak", "czech")
] <- "West Slav"
```

*East Slavic*
```{r}
# East Slav: Russian or Ukrainian language labels, or bpl_mom code 465.
dat$ethnicity[
  dat$language %in% c("russian", "ukrainian, ruthenian, little russian") |
    dat$bpl_mom == 465
] <- "East Slav"
```

*South Slavic*
```{r}
# South Slav: the Serbo-Croatian label plus "other balto-slavic".
dat$ethnicity[
  dat$language %in% c(
    "serbo-croatian, yugoslavian, slavonian",
    "other balto-slavic"
  )
] <- "South Slav"
```

*Navajo*
```{r}
# Navajo: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "navajo")] <- "Navajo"
```

*Armenian*
```{r}
# Armenian: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "armenian")] <- "Armenian"
```

*Turkish*
```{r}
# Turkish: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "turkish")] <- "Turkish"
```

*Dravidian*
```{r}
# Dravidian: detailed language codes 4003, 4004, and 4005 (languaged).
dat$ethnicity[which(
  dat$languaged == 4003 |
    dat$languaged == 4004 |
    dat$languaged == 4005
)] <- "Dravidian"
```

*Tibetan*
```{r}
# Tibetan: identified by language label only.
dat[["ethnicity"]][which(dat[["language"]] == "tibetan")] <- "Tibetan"
```

*Burmese*
```{r}
# Burmese: everyone under the combined "burmese, lisu, lolo" language label.
dat[["ethnicity"]][
  which(dat[["language"]] == "burmese, lisu, lolo")
] <- "Burmese"
```

*Cambodian*
```{r}
# Cambodian: taken from the "other east/southeast asian" language label.
dat[["ethnicity"]][
  which(dat[["language"]] == "other east/southeast asian")
] <- "Cambodian"
```

*Indonesian*
```{r}
# Indonesian: "indonesian" or "other malayan" language labels.
dat$ethnicity[
  dat$language %in% c("indonesian", "other malayan")
] <- "Indonesian"
```

*Polynesian*
```{r}
# Polynesian: "micronesian, polynesian" or "hawaiian" language labels.
dat$ethnicity[
  dat$language %in% c("micronesian, polynesian", "hawaiian")
] <- "Polynesian"
```

*Hebrew/Israeli*
```{r}
# Hebrew: identified by the "hebrew, israeli" language label.
dat[["ethnicity"]][which(dat[["language"]] == "hebrew, israeli")] <- "Hebrew"
```

*Scandinavian*
```{r}
# Scandinavian: Swedish, Danish, or Norwegian language labels.
dat$ethnicity[
  dat$language %in% c("swedish", "danish", "norwegian")
] <- "Scandinavian"
```

*Persian*
```{r}
# Persian: identified by the "persian, iranian, farsi" language label.
dat[["ethnicity"]][
  which(dat[["language"]] == "persian, iranian, farsi")
] <- "Persian"
```

*Hindi*
```{r}
# Hindi: "hindi and related" language label, detailed language code 4011,
# or bpl_mom code 521.
dat$ethnicity[which(
  dat$language == "hindi and related" |
    dat$languaged == 4011 |
    dat$bpl_mom == 521
)] <- "Hindi"
```

*East African*
```{r}
# East African: "amharic, ethiopian, etc" or "hamitic" language labels.
dat$ethnicity[
  dat$language %in% c("amharic, ethiopian, etc", "hamitic")
] <- "East African"
```

*Niger-Congolese*
```{r}
# Niger-Congolese: detailed language code 6321 (languaged).
dat[["ethnicity"]][which(dat[["languaged"]] == 6321)] <- "Niger-Congolese"
```

*English-speaking Americans*
```{r}
# American: detailed language code 100 (languaged).
# NOTE(review): this runs after the bpl_mom-based assignments above, so any
# row given an ethnicity through bpl_mom is overwritten here when its
# languaged is 100 -- confirm that ordering is intended.
dat[["ethnicity"]][which(dat[["languaged"]] == 100)] <- "American"
```

*Languages with fewer than 5,000 observations (grouped as "Other")*
```{r}
# Group all languages with fewer than 5,000 observations into a single
# "Other" category.
#
# Labels are compared verbatim against the levels of dat$language, so a
# string that is not an exact level silently matches nothing. The original
# "yaqui and other sonoran, nec" literal began with a stray tab character and
# therefore could never match a level; the tab is removed here.
# NOTE(review): confirm every label is a real level with
#   setdiff(other_languages, levels(dat$language))
other_languages <- c(
  "icelandic", "scandinavian", "celtic", "slovene", "slavic unknown",
  "romany, gypsy", "finnish", "magyar, hungarian", "uralic",
  "caucasian, georgian, avar", "basque", "kurukh", "burushaski", "native",
  "kachin", "african, n.s", "american indian (all)", "aleut, eskimo",
  "algonquian", "salish, flathead", "athapascan", "penutian-sahaptin",
  "other penutian", "zuni", "yuman", "other hokan languages",
  "siouan languages", "muskogean", "keres", "iroquoian", "other altaic",
  "caddoan", "shoshonean/hopi", "pima, papago",
  "yaqui and other sonoran, nec", "aztecan, nahuatl, uto-aztecan",
  "tanoan languages", "other indian languages", "lithuanian",
  "mayan languages", "american indian, n.s.", "no language",
  "other or not reported", "other persian dialects",
  "other afro-asiatic languages", "not reported, blank"
)

# Detailed-language (languaged) codes that are also folded into "Other".
other_languaged <- c(110, 170, 6304, 6307, 6308, 6309, 6310, 6390, 1150)

# %in% returns FALSE for missing values, which matches how the original
# `==` chain behaved inside a subscripted assignment (NA rows were skipped).
dat$ethnicity[
  dat$language %in% other_languages |
    as.numeric(dat$languaged) %in% other_languaged
] <- "Other"
```

*NAs*
```{r}
# Rows whose language label is "n/a or blank" get a missing ethnicity.
is.na(dat$ethnicity) <- which(dat$language == "n/a or blank")
```

```{r}
# Ethnicity counts so far, including a row for missing values.
table(dat[["ethnicity"]], useNA = "always")
```

*Hispanic*

```{r}
# Also bring in observations with mothers born in Latin America: collapse
# these eight bpl_mom codes into one synthetic code, 1000, which the Hispanic
# chunk below tests for.
dat$bpl_mom[which(
  dat$bpl_mom == 200 |
    dat$bpl_mom == 210 |
    dat$bpl_mom == 250 |
    dat$bpl_mom == 260 |
    dat$bpl_mom == 299 |
    dat$bpl_mom == 300 |
    dat$bpl_mom == 110 |
    dat$bpl_mom == 438
)] <- 1000
```

```{r}
# Hispanic: speaks Spanish, or mother's birthplace falls in the grouped
# bpl_mom code 1000. This runs last, so it overrides any ethnicity assigned
# by the earlier chunks.
dat$ethnicity[
  which(dat$language == "spanish" | dat$bpl_mom == 1000)
] <- "Hispanic"
```

*Proportions of ethnic background*

```{r}
# Final ethnicity counts, including a row for missing values.
table(dat[["ethnicity"]], useNA = "always")
```

```{r}
#no values are unaccounted for in the ethnicity variable
```

```{r}
# Unweighted percentage of observations in each ethnicity (NAs excluded).
100 * prop.table(table(dat[["ethnicity"]]))
```

```{r}
# Keep a copy of the full data, taken before the rows with a missing
# ethnicity are dropped in the next chunk.
dat2 <- dat
```

```{r}
# Keep only the rows that have an ethnicity assigned.
dat <- dat[!is.na(dat$ethnicity), ]
```

```{r}
# Ethnicity counts after filtering; the missing-value row should now be zero.
table(dat[["ethnicity"]], useNA = "always")
```

**PUMA**

```{r}
# Inspect the class of the state FIPS column before converting it.
class(dat[["statefip"]])
```

```{r}
# Add a factor version of statefip built from its value labels; the original
# statefip column is left untouched.
dat[["statefip_name"]] <- haven::as_factor(dat[["statefip"]])
```

```{r}
# Check the class of the new statefip_name column.
class(dat[["statefip_name"]])
```

```{r}
#Merging puma and statefip variable together into "puma_2"
```

```{r}
# Build a PUMA identifier, puma_2, as "<statefip>-<puma>". remove = FALSE
# keeps the two source columns in the data.
dat <- tidyr::unite(
  dat,
  col = "puma_2",
  statefip, puma,
  sep = "-",
  remove = FALSE
)
```

```{r}
# Build a person-level identifier, unique_id, as "<sample>-<serial>-<pernum>".
# remove = FALSE keeps the three source columns in the data.
dat <- tidyr::unite(
  dat,
  col = "unique_id",
  sample, serial, pernum,
  sep = "-",
  remove = FALSE
)


```

```{r}
# Weighted proportion of each ethnicity within each PUMA (puma_2).
#   persons: sum of person weights (perwt) for the puma_2 x ethnicity cell
#   prop:    the cell's share of its PUMA's weighted total
#   rank:    1 = largest share within the PUMA (ties share the lowest rank)
# .groups = "drop_last" states explicitly that the summary stays grouped by
# puma_2, which the prop/rank step relies on. ungroup() then returns a plain
# tibble so later steps do not inherit the grouping by accident.
sums <- dat %>%
  group_by(puma_2, ethnicity) %>%
  dplyr::summarize(
    persons = sum(perwt, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  dplyr::mutate(
    prop = persons / sum(persons, na.rm = TRUE),
    rank = min_rank(-prop)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(puma_2, rank)
```

```{r}
# PUMA x ethnicity rows where the ethnicity makes up 30% or more of the PUMA.
prop_over30 <- dplyr::filter(sums, prop >= 0.30)
```

```{r}
# How many PUMAs each ethnicity reaches the 30% threshold in.
table(prop_over30[["ethnicity"]], useNA = "always")
``` 

```{r}
# PUMA x ethnicity rows where the ethnicity makes up less than 30% of the PUMA.
prop_under30 <- dplyr::filter(sums, prop < 0.30)
```

```{r}
# Export the ethnicity-proportion table (one row per PUMA x ethnicity),
# without a row-number column.
utils::write.csv(
  x = sums,
  file = "~/Desktop/Senior Thesis/Census/ethnicity_by_puma.csv",
  row.names = FALSE
)
```

```{r}
# Hispanic enclaves: PUMAs where the Hispanic share of the population is
# 30% or more. The result is exported in the next chunk.
Hisp_enclaves <- dplyr::filter(sums, ethnicity == "Hispanic", prop >= .30)
```

```{r}
# Export the Hispanic enclaves table. row.names = FALSE matches the
# ethnicity_by_puma.csv export and keeps a meaningless row-number column out
# of the file.
write.csv(
  Hisp_enclaves,
  "~/Desktop/Senior Thesis/Census/Hisp_enclaves.csv",
  row.names = FALSE
)
```

**Exporting updated dataset**

```{r}
# Final person-level data (ethnicity, puma_2, and unique_id added; rows with
# a missing ethnicity removed) under the name used for the export below.
merged_ACS <- dat
```

```{r}
# Save the updated dataset as an RDS file.
saveRDS(
  object = merged_ACS,
  file = "~/Desktop/Senior Thesis/Census/Merged_ACS.rds"
)
```

```{r}
# List the columns in the exported dataset.
colnames(merged_ACS)
```














